library(tidyverse)
replacing previous import ‘vctrs::data_frame’ by ‘tibble::data_frame’ when loading ‘dplyr’Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2     ✓ purrr   0.3.4
✓ tibble  3.0.3     ✓ dplyr   1.0.0
✓ tidyr   1.1.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
package ‘ggplot2’ was built under R version 3.6.2package ‘tibble’ was built under R version 3.6.2package ‘tidyr’ was built under R version 3.6.2package ‘purrr’ was built under R version 3.6.2package ‘dplyr’ was built under R version 3.6.2── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

house_data <- read_csv("data/kc_house_data.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  id = col_character(),
  date = col_datetime(format = "")
)
See spec(...) for full column specifications.

CODECLAN SOLUTION

houses <- read_csv("data/kc_house_data.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  id = col_character(),
  date = col_datetime(format = "")
)
See spec(...) for full column specifications.
glimpse(houses)
Rows: 21,613
Columns: 21
$ id            <chr> "7129300520", "6414100192", "5631500400", "2487200875", "1954400510", "723…
$ date          <dttm> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-02-18, 2014-05-12, 2…
$ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 3…
$ bedrooms      <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3,…
$ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.…
$ sqft_living   <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 143…
$ sqft_lot      <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000,…
$ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5,…
$ waterfront    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ view          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0,…
$ condition     <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4,…
$ grade         <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8…
$ sqft_above    <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430…
$ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0…
$ yr_built      <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 19…
$ yr_renovated  <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ zipcode       <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, 98146, 98038, 9800…
$ lat           <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47…
$ long          <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122…
$ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2390, 2210, 1330, 17…
$ sqft_lot15    <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, 7570, 8925, 6000, …
library(modelr)
house_data

Tidy up the data ready for regression:

You might like to think about removing some or all of date, id, sqft_living15, sqft_lot15 and zipcode (lat and long provide a better measure of location in any event).

# Drop the identifier, the sale date, the neighbour-based area columns and
# zipcode before modelling; every other column is kept unchanged.
house_data_tidy <- select(
  house_data,
  -date, -id, -sqft_living15, -sqft_lot15, -zipcode
)

house_data_tidy

CODECLAN SOLUTION

# Tidy the raw data for regression:
#   * remove id, date, sqft_living15, sqft_lot15 and zipcode;
#   * turn the 0/1 waterfront flag into a logical;
#   * replace yr_renovated with a logical `renovated` flag (TRUE when the
#     recorded renovation year is non-zero);
#   * store condition and grade as factors so lm() fits one coefficient per
#     level instead of a single slope.
houses_tidy <- houses %>%
  select(-id, -date, -sqft_living15, -sqft_lot15, -zipcode) %>%
  mutate(
    waterfront = as.logical(waterfront),
    renovated = yr_renovated != 0,
    condition = as_factor(condition),
    grade = as_factor(grade)
  ) %>%
  select(-yr_renovated)

glimpse(houses_tidy)
Rows: 21,613
Columns: 16
$ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 3…
$ bedrooms      <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3,…
$ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.…
$ sqft_living   <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 143…
$ sqft_lot      <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000,…
$ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5,…
$ waterfront    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ view          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0,…
$ condition     <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4,…
$ grade         <fct> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8…
$ sqft_above    <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430…
$ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0…
$ yr_built      <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 19…
$ lat           <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47…
$ long          <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122…
$ renovated     <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…

Have a think about how to treat waterfront. Should we convert its type?

We converted yr_renovated into a renovated logical variable, indicating whether the property had ever been renovated. You may wish to do the same.

Have a think about how to treat condition and grade? Are they interval or categorical ordinal data types?

Check for aliased variables using the alias() function (this takes in a formula object and a data set). [Hint - formula price ~ . says ‘price varying with all predictors’, this is a suitable input to alias()]. Remove variables that lead to an alias. Check the ‘Elements of multiple regression’ lesson for a dropdown containing further information on finding aliased variables in a dataset.

library(modelr)
package ‘modelr’ was built under R version 3.6.2
alias(lm(price~ ., data = house_data_tidy))
Model :
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
    waterfront + view + condition + grade + sqft_above + sqft_basement + 
    yr_built + yr_renovated + lat + long

Complete :
              (Intercept) bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition
sqft_basement  0           0        0         1           0        0      0          0    0       
              grade sqft_above yr_built yr_renovated lat long
sqft_basement  0    -1          0        0            0   0  
# The alias() output above reports a single exact dependency:
#   sqft_basement = sqft_living - sqft_above
# Removing any one of the three columns breaks it, so drop only sqft_living
# and keep sqft_above available as a candidate predictor.
house_price_tidy <- house_data_tidy %>%
  select(-sqft_living)

CODECLAN SOLUTION

# Alias is useful to check if we have aliased variables, i.e. one or more
# variables that can be computed from other variables
alias(price ~ ., data = houses_tidy)
Model :
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
    waterfront + view + condition + grade + sqft_above + sqft_basement + 
    yr_built + lat + long + renovated

Complete :
              (Intercept) bedrooms bathrooms sqft_living sqft_lot floors waterfrontTRUE view
sqft_basement  0           0        0         1           0        0      0              0  
              condition2 condition3 condition4 condition5 grade3 grade4 grade5 grade6 grade7
sqft_basement  0          0          0          0          0      0      0      0      0    
              grade8 grade9 grade10 grade11 grade12 grade13 sqft_above yr_built lat long
sqft_basement  0      0      0       0       0       0      -1          0        0   0  
              renovatedTRUE
sqft_basement  0           
# alias() shows that sqft_basement can be computed as
# sqft_living - sqft_above, so one of the three columns has to go.
# Drop sqft_living, leaving the two contributions sqft_basement and
# sqft_above.
# any_of() keeps this step safe to re-run: naming the column directly makes
# select() fail with "Column `sqft_living` doesn't exist" once it has already
# been removed from houses_tidy.
houses_tidy <- houses_tidy %>%
  select(-any_of("sqft_living"))
Error: Can't subset columns that don't exist.
x Column `sqft_living` doesn't exist.

Systematically build a regression model containing up to four main effects (remember, a main effect is just a single predictor with coefficient), testing the regression diagnostics as you go

splitting datasets into numeric and non-numeric columns might help ggpairs() run in manageable time, although you will need to add either a price or resid column to the non-numeric dataframe in order to see its correlations with the non-numeric predictors.

library(GGally)
package ‘GGally’ was built under R version 3.6.2Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
# Split the predictors by type so each ggpairs() call stays manageable.
houses_tidy_numeric <- select(house_price_tidy, where(is.numeric))

# price is numeric, so it is explicitly carried along with the non-numeric
# columns to show how price varies with each of them.
houses_tidy_nonnumeric <- select(house_price_tidy, !where(is.numeric), price)

ggpairs(houses_tidy_numeric)

ggpairs(houses_tidy_nonnumeric)

mod1a <- lm(price ~ grade, data = houses_tidy_numeric)

mod1a

Call:
lm(formula = price ~ grade, data = houses_tidy_numeric)

Coefficients:
(Intercept)        grade  
   -1056045       208458  
summary(mod1a)

Call:
lm(formula = price ~ grade, data = houses_tidy_numeric)

Residuals:
    Min      1Q  Median      3Q     Max 
-816988 -151958  -36158   97842 6046097 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1056045      12256  -86.17   <2e-16 ***
grade         208458       1582  131.76   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 273400 on 21611 degrees of freedom
Multiple R-squared:  0.4455,    Adjusted R-squared:  0.4454 
F-statistic: 1.736e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(mod1a)

mod2a <- lm(price ~ condition, data = house_data_tidy)

mod2a

Call:
lm(formula = price ~ condition, data = house_data_tidy)

Coefficients:
(Intercept)    condition  
     470147        20514  
summary(mod2a)

Call:
lm(formula = price ~ condition, data = house_data_tidy)

Residuals:
    Min      1Q  Median      3Q     Max 
-463203 -217203  -87203  101797 7147797 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   470147      13312  35.318  < 2e-16 ***
condition      20514       3835   5.349 8.94e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 366900 on 21611 degrees of freedom
Multiple R-squared:  0.001322,  Adjusted R-squared:  0.001276 
F-statistic: 28.61 on 1 and 21611 DF,  p-value: 8.936e-08
# Residuals of the grade-only model (mod1a) alongside the predictors that are
# not yet in the model; price and grade are dropped because mod1a already
# accounts for them.
house_remianing_resid <- houses_tidy_numeric %>%
  add_residuals(mod1a) %>%
  select(-c("price", "grade"))

# ggpairs() only accepts a categorical column for the colour mapping, and
# condition is stored as a double here, so convert it to a factor first.
house_remianing_resid %>%
  mutate(condition = as_factor(condition)) %>%
  ggpairs(aes(colour = condition, alpha = 0.5))
Error in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm,  : 
  `mapping` color column must be categorical, not numeric


# Diagnostic plots for the condition-only model, laid out on a 2 x 2 grid.
par(mfrow = c(2,2))
plot(mod2a)

# mod1a (price ~ grade) and mod2a (price ~ condition) are not nested: neither
# adds terms to the other, so anova(mod1a, mod2a) reports 0 Df and cannot
# compute an F test. Compare the two single-predictor fits on AIC and
# adjusted R-squared instead (lower AIC / higher adjusted R-squared is better).
AIC(mod1a, mod2a)
c(
  mod1a = summary(mod1a)$adj.r.squared,
  mod2a = summary(mod2a)$adj.r.squared
)
Analysis of Variance Table

Model 1: price ~ grade
Model 2: price ~ condition
  Res.Df        RSS Df   Sum of Sq F Pr(>F)
1  21611 1.6153e+15                        
2  21611 2.9091e+15  0 -1.2938e+15         

mod1b <- lm(price ~ long, data = houses_tidy_numeric)

mod1b

Call:
lm(formula = price ~ long, data = houses_tidy_numeric)

Coefficients:
(Intercept)         long  
    7430229        56378  
summary(mod1b)

Call:
lm(formula = price ~ long, data = houses_tidy_numeric)

Residuals:
    Min      1Q  Median      3Q     Max 
-482447 -218209  -88758  102766 7166063 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7430229    2166748   3.429 0.000606 ***
long           56378      17729   3.180 0.001475 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 367000 on 21611 degrees of freedom
Multiple R-squared:  0.0004677, Adjusted R-squared:  0.0004214 
F-statistic: 10.11 on 1 and 21611 DF,  p-value: 0.001475

CODECLAN SOLUTION

# Split the predictors by type so each ggpairs() call stays manageable.
houses_tidy_numeric <- select(houses_tidy, where(is.numeric))

# price is numeric, so it is explicitly carried along with the non-numeric
# columns to show how price splits across each of them.
houses_tidy_nonnumeric <- select(houses_tidy, !where(is.numeric), price)

ggpairs(houses_tidy_numeric)

ggpairs(houses_tidy_nonnumeric)

Correlation of sqft_above with price looks pretty promising, but split of price by grade and waterfront also look decent.

# Distribution of price within each grade level.
ggplot(houses_tidy, aes(grade, price)) +
  geom_boxplot()

# Distribution of price for waterfront vs non-waterfront properties.
ggplot(houses_tidy, aes(waterfront, price)) +
  geom_boxplot()

mod1_a <- lm(price ~ sqft_above, data = houses_tidy)
summary(mod1_a)

Call:
lm(formula = price ~ sqft_above, data = houses_tidy)

Residuals:
    Min      1Q  Median      3Q     Max 
-913132 -165624  -41468  109327 5339232 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  59953.2     4729.8   12.68   <2e-16 ***
sqft_above     268.5        2.4  111.87   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 292200 on 21611 degrees of freedom
Multiple R-squared:  0.3667,    Adjusted R-squared:  0.3667 
F-statistic: 1.251e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
mod1_b <- lm(price ~ grade, data = houses_tidy)
summary(mod1_b)

Call:
lm(formula = price ~ grade, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1929615  -135853   -35090    89080  5565658 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   142000     254499   0.558 0.576878    
grade3         63667     293870   0.217 0.828484    
grade4         72381     258849   0.280 0.779767    
grade5        106524     255024   0.418 0.676169    
grade6        159920     254561   0.628 0.529868    
grade7        260590     254513   1.024 0.305904    
grade8        400853     254520   1.575 0.115285    
grade9        631513     254547   2.481 0.013112 *  
grade10       929771     254611   3.652 0.000261 ***
grade11      1354842     254817   5.317 1.07e-07 ***
grade12      2049222     255909   8.008 1.23e-15 ***
grade13      3567615     264106  13.508  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 254500 on 21601 degrees of freedom
Multiple R-squared:  0.5197,    Adjusted R-squared:  0.5195 
F-statistic:  2125 on 11 and 21601 DF,  p-value: < 2.2e-16
mod1_c <- lm(price ~ waterfront, data = houses_tidy)
summary(mod1_c)

Call:
lm(formula = price ~ waterfront, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1376876  -211564   -81564   108436  7168436 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      531564       2416  220.00   <2e-16 ***
waterfrontTRUE  1130312      27822   40.63   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 353900 on 21611 degrees of freedom
Multiple R-squared:  0.07095,   Adjusted R-squared:  0.07091 
F-statistic:  1650 on 1 and 21611 DF,  p-value: < 2.2e-16
# grade looks the most promising, but some of the grade level coeffs are insignificant.
# the F-test at the bottom of the regression output tests against the null model (i.e. intercept only)
# but, if we want, we can replicate this using a separate anova
# null model: regress price on intercept only
null_model <- lm(price ~ 1, data = houses_tidy)
grade_model <- lm(price ~ grade, data = houses_tidy)
anova(null_model, grade_model)
Analysis of Variance Table

Model 1: price ~ 1
Model 2: price ~ grade
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1  21612 2.9129e+15                                   
2  21601 1.3991e+15 11 1.5138e+15 2124.8 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# grade is significant, let's keep it. Now plot diagnostics
par(mfrow = c(2, 2))
plot(mod1_b)
not plotting observations with leverage one:
  19453

# Residuals of the grade-only model against the remaining predictors, used to
# choose the second main effect. The model kept above is mod1_b
# (price ~ grade); mod1b is a different fit (price ~ long), so its residuals
# would not reflect the variation left over after grade.
# price and grade are dropped because mod1_b already accounts for them.
houses_resid <- houses_tidy %>%
  add_residuals(mod1_b) %>%
  select(-c("price", "grade"))

houses_resid_numeric <- houses_resid %>%
  select_if(is.numeric)

houses_resid_nonnumeric <- houses_resid %>%
  select_if(function(x) !is.numeric(x))

# resid is numeric, so re-attach it to the non-numeric columns in order to see
# how the residuals vary with each of them.
houses_resid_nonnumeric$resid <- houses_resid$resid
ggpairs(houses_resid_numeric)

ggpairs(houses_resid_nonnumeric)

lat has highest correlation with residuals, but, again, waterfront still looks pretty promising. Try both…

mod2_a <- lm(price ~ grade + lat, data = houses_tidy)
summary(mod2_a)

Call:
lm(formula = price ~ grade + lat, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1863783  -112379   -28181    67454  5533910 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -29949769     610666 -49.044  < 2e-16 ***
grade3         187923     276126   0.681 0.496151    
grade4          95367     243212   0.392 0.694978    
grade5         126354     239618   0.527 0.597980    
grade6         158453     239183   0.662 0.507673    
grade7         246275     239138   1.030 0.303093    
grade8         378635     239144   1.583 0.113369    
grade9         603010     239170   2.521 0.011701 *  
grade10        891752     239230   3.728 0.000194 ***
grade11       1311124     239425   5.476  4.4e-08 ***
grade12       2007093     240450   8.347  < 2e-16 ***
grade13       3502099     248154  14.113  < 2e-16 ***
lat            633100      11822  53.554  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 239100 on 21600 degrees of freedom
Multiple R-squared:  0.576, Adjusted R-squared:  0.5758 
F-statistic:  2445 on 12 and 21600 DF,  p-value: < 2.2e-16
mod2_b <- lm(price ~ grade + waterfront, data = houses_tidy)
summary(mod2_b)

Call:
lm(formula = price ~ grade + waterfront, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1929615  -132969   -32393    91481  4778940 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      142000     244366   0.581   0.5612    
grade3            63667     282169   0.226   0.8215    
grade4            72381     248543   0.291   0.7709    
grade5            92834     244870   0.379   0.7046    
grade6           155043     244426   0.634   0.5259    
grade7           258469     244379   1.058   0.2902    
grade8           395393     244386   1.618   0.1057    
grade9           623595     244413   2.551   0.0107 *  
grade10          909321     244474   3.719   0.0002 ***
grade11         1313326     244674   5.368 8.06e-08 ***
grade12         1947993     245731   7.927 2.35e-15 ***
grade13         3567615     253590  14.068  < 2e-16 ***
waterfrontTRUE   828234      19363  42.773  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 244400 on 21600 degrees of freedom
Multiple R-squared:  0.5572,    Adjusted R-squared:  0.557 
F-statistic:  2265 on 12 and 21600 DF,  p-value: < 2.2e-16
# lat is significant and mod2_a has the higher r^2 of the two candidates
# (0.576 vs 0.5572 for mod2_b), so keep mod2_a.
# Lay its diagnostic plots out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(mod2_a)
not plotting observations with leverage one:
  19453

# Residuals of mod2_a (price ~ grade + lat) against the predictors that are
# not yet in the model, used to choose the third main effect.
houses_resid <- add_residuals(houses_tidy, mod2_a) %>%
  select(-price, -grade, -lat)

houses_resid_numeric <- select(houses_resid, where(is.numeric))

# resid is numeric, so it is explicitly carried along with the non-numeric
# columns to see how the residuals vary with each of them.
houses_resid_nonnumeric <- select(houses_resid, !where(is.numeric), resid)

ggpairs(houses_resid_numeric)

ggpairs(houses_resid_nonnumeric)

Now view has strongest correlation with residuals, but also compare against model with waterfront.

mod3_a <- lm(price ~ grade + lat + view, data = houses_tidy)
summary(mod3_a)

Call:
lm(formula = price ~ grade + lat + view, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1665265  -105866   -21393    69623  5429667 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -30514024     576750 -52.907  < 2e-16 ***
grade3         190253     260743   0.730 0.465607    
grade4          81058     229663   0.353 0.724133    
grade5         112154     226269   0.496 0.620134    
grade6         148515     225858   0.658 0.510827    
grade7         235345     225815   1.042 0.297328    
grade8         351873     225822   1.558 0.119203    
grade9         556584     225848   2.464 0.013731 *  
grade10        821118     225907   3.635 0.000279 ***
grade11       1200228     226097   5.308 1.12e-07 ***
grade12       1832950     227080   8.072 7.28e-16 ***
grade13       3303587     234361  14.096  < 2e-16 ***
lat            644972      11166  57.764  < 2e-16 ***
view           106862       2086  51.234  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 225800 on 21599 degrees of freedom
Multiple R-squared:  0.6219,    Adjusted R-squared:  0.6217 
F-statistic:  2733 on 13 and 21599 DF,  p-value: < 2.2e-16
mod3_b <- lm(price ~ grade + lat + waterfront, data = houses_tidy)
summary(mod3_b)

Call:
lm(formula = price ~ grade + lat + waterfront, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1862555  -109244   -24781    70145  4724767 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -30511083     581588 -52.462  < 2e-16 ***
grade3            190241     262923   0.724 0.469343    
grade4             95796     231583   0.414 0.679130    
grade5            112654     228161   0.494 0.621488    
grade6            153414     227746   0.674 0.500562    
grade7            243828     227703   1.071 0.284264    
grade8            372609     227709   1.636 0.101783    
grade9            594340     227734   2.610 0.009066 ** 
grade10           870026     227792   3.819 0.000134 ***
grade11          1267641     227979   5.560 2.72e-08 ***
grade12          1902270     228964   8.308  < 2e-16 ***
grade13          3500877     236288  14.816  < 2e-16 ***
lat               644910      11259  57.278  < 2e-16 ***
waterfrontTRUE    851219      18046  47.168  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 227700 on 21599 degrees of freedom
Multiple R-squared:  0.6156,    Adjusted R-squared:  0.6154 
F-statistic:  2661 on 13 and 21599 DF,  p-value: < 2.2e-16
# The model with view has the higher r^2 (0.6219 vs 0.6156 for mod3_b),
# so keep mod3_a.
# Lay its diagnostic plots out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(mod3_a)
not plotting observations with leverage one:
  19453

# Residuals of mod3_a (price ~ grade + lat + view) against the predictors that
# are not yet in the model, used to choose the fourth main effect.
houses_resid <- add_residuals(houses_tidy, mod3_a) %>%
  select(-price, -grade, -lat, -view)

houses_resid_numeric <- select(houses_resid, where(is.numeric))

# resid is numeric, so it is explicitly carried along with the non-numeric
# columns to see how the residuals vary with each of them.
houses_resid_nonnumeric <- select(houses_resid, !where(is.numeric), resid)

ggpairs(houses_resid_numeric)

ggpairs(houses_resid_nonnumeric)

sqft_basement has highest correlation with residuals. Let’s test against all remaining categorical predictors:

mod4_a <- lm(price ~ grade + lat + view + sqft_basement, data = houses_tidy)
summary(mod4_a)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1569582  -105308   -18663    70867  5230039 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -2.875e+07  5.647e+05 -50.913  < 2e-16 ***
grade3         1.830e+05  2.542e+05   0.720  0.47164    
grade4         8.134e+04  2.239e+05   0.363  0.71638    
grade5         1.088e+05  2.206e+05   0.493  0.62197    
grade6         1.355e+05  2.202e+05   0.615  0.53841    
grade7         2.042e+05  2.201e+05   0.927  0.35373    
grade8         3.194e+05  2.201e+05   1.451  0.14687    
grade9         5.281e+05  2.202e+05   2.399  0.01646 *  
grade10        7.859e+05  2.202e+05   3.568  0.00036 ***
grade11        1.156e+06  2.204e+05   5.244 1.59e-07 ***
grade12        1.765e+06  2.214e+05   7.974 1.61e-15 ***
grade13        3.169e+06  2.285e+05  13.869  < 2e-16 ***
lat            6.079e+05  1.094e+04  55.559  < 2e-16 ***
view           8.903e+04  2.101e+03  42.367  < 2e-16 ***
sqft_basement  1.204e+02  3.582e+00  33.600  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 220100 on 21598 degrees of freedom
Multiple R-squared:  0.6407,    Adjusted R-squared:  0.6405 
F-statistic:  2751 on 14 and 21598 DF,  p-value: < 2.2e-16
mod4_b <- lm(price ~ grade + lat + view + waterfront, data = houses_tidy)
summary(mod4_b)

Call:
lm(formula = price ~ grade + lat + view + waterfront, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1713543  -105538   -20871    69842  4901827 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -30758475     564698 -54.469  < 2e-16 ***
grade3            191262     255269   0.749 0.453709    
grade4             84892     224841   0.378 0.705758    
grade5            106294     221518   0.480 0.631345    
grade6            147526     221116   0.667 0.504659    
grade7            236375     221074   1.069 0.284985    
grade8            354372     221081   1.603 0.108970    
grade9            562139     221106   2.542 0.011016 *  
grade10           823728     221164   3.725 0.000196 ***
grade11          1197915     221350   5.412 6.30e-08 ***
grade12          1804315     222314   8.116 5.07e-16 ***
grade13          3351868     229446  14.609  < 2e-16 ***
lat               650115      10932  59.466  < 2e-16 ***
view               80422       2217  36.273  < 2e-16 ***
waterfrontTRUE    582422      19024  30.615  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 221100 on 21598 degrees of freedom
Multiple R-squared:  0.6377,    Adjusted R-squared:  0.6374 
F-statistic:  2715 on 14 and 21598 DF,  p-value: < 2.2e-16
mod4_c <- lm(price ~ grade + lat + view + condition, data = houses_tidy)
summary(mod4_c)

Call:
lm(formula = price ~ grade + lat + view + condition, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1665132  -105472   -18392    71578  5447648 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.043e+07  5.673e+05 -53.640  < 2e-16 ***
grade3       1.472e+05  2.595e+05   0.567 0.570486    
grade4       6.481e+04  2.291e+05   0.283 0.777293    
grade5       7.656e+04  2.258e+05   0.339 0.734605    
grade6       1.165e+05  2.257e+05   0.516 0.605628    
grade7       2.090e+05  2.257e+05   0.926 0.354406    
grade8       3.359e+05  2.257e+05   1.488 0.136720    
grade9       5.476e+05  2.257e+05   2.426 0.015270 *  
grade10      8.154e+05  2.258e+05   3.612 0.000305 ***
grade11      1.199e+06  2.260e+05   5.307 1.12e-07 ***
grade12      1.834e+06  2.269e+05   8.082 6.69e-16 ***
grade13      3.310e+06  2.339e+05  14.148  < 2e-16 ***
lat          6.432e+05  1.098e+04  58.552  < 2e-16 ***
view         1.017e+05  2.058e+03  49.425  < 2e-16 ***
condition2  -4.275e+02  4.466e+04  -0.010 0.992363    
condition3  -6.453e+03  4.154e+04  -0.155 0.876556    
condition4   5.758e+04  4.158e+04   1.385 0.166054    
condition5   1.349e+05  4.180e+04   3.226 0.001258 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 221800 on 21595 degrees of freedom
Multiple R-squared:  0.6352,    Adjusted R-squared:  0.635 
F-statistic:  2212 on 17 and 21595 DF,  p-value: < 2.2e-16
mod4_d <- lm(price ~ grade + lat + view + renovated, data = houses_tidy)
summary(mod4_d)

Call:
lm(formula = price ~ grade + lat + view + renovated, data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1637628  -105101   -19410    70577  5280904 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -30179084     571198 -52.835  < 2e-16 ***
grade3           188870     258132   0.732 0.464372    
grade4            75932     227363   0.334 0.738407    
grade5           108602     224003   0.485 0.627808    
grade6           141362     223596   0.632 0.527250    
grade7           229992     223554   1.029 0.303586    
grade8           346281     223561   1.549 0.121413    
grade9           550447     223587   2.462 0.013828 *  
grade10          817501     223645   3.655 0.000257 ***
grade11         1199432     223833   5.359 8.47e-08 ***
grade12         1835525     224806   8.165 3.39e-16 ***
grade13         3275946     232018  14.119  < 2e-16 ***
lat              637925      11059  57.684  < 2e-16 ***
view             102285       2076  49.261  < 2e-16 ***
renovatedTRUE    159554       7606  20.979  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 223500 on 21598 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6292 
F-statistic:  2621 on 14 and 21598 DF,  p-value: < 2.2e-16
# The model with sqft_basement has the highest r^2 of the four candidates
# (0.6407 vs 0.6377, 0.6352 and 0.6295 for mod4_b, mod4_c and mod4_d),
# so keep mod4_a.
# Lay its diagnostic plots out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(mod4_a)
not plotting observations with leverage one:
  19453

houses_resid <- houses_tidy %>%
  add_residuals(mod4_a) %>%
  select(- price)

Our final model in terms of main effects is: price ~ grade + lat + view + sqft_basement

EXTENSION

Consider possible interactions between your four main effect predictors and test their effect upon $r^2$. Choose your best candidate interaction and visualise its effect.

Calculate the relative importance of predictors from your best 4-predictor model (i.e. the model without an interaction). Which predictor affects price most strongly?

Now, for interactions, we have six possibilities that obey the principle of strong hierarchy (i.e. consider including an interaction only if its main effects are already present in the model):

mod5_a <- lm(price ~ grade + lat + view + sqft_basement + grade:lat, data = houses_tidy)
summary(mod5_a)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement + grade:lat, 
    data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1496592  -101302   -20837    67958  5196329 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -4.375e+07  4.424e+07  -0.989   0.3228    
grade3         3.068e+07  6.120e+07   0.501   0.6162    
grade4         3.760e+07  4.798e+07   0.784   0.4332    
grade5         2.917e+07  4.457e+07   0.654   0.5129    
grade6         2.420e+07  4.428e+07   0.546   0.5848    
grade7         1.912e+07  4.425e+07   0.432   0.6656    
grade8         1.546e+07  4.425e+07   0.349   0.7267    
grade9        -7.530e+05  4.427e+07  -0.017   0.9864    
grade10       -1.492e+07  4.435e+07  -0.336   0.7366    
grade11       -1.469e+07  4.466e+07  -0.329   0.7422    
grade12       -8.399e+07  4.632e+07  -1.813   0.0698 .  
grade13        3.135e+06  2.463e+05  12.728   <2e-16 ***
lat            9.234e+05  9.308e+05   0.992   0.3212    
view           8.949e+04  2.086e+03  42.892   <2e-16 ***
sqft_basement  1.208e+02  3.556e+00  33.968   <2e-16 ***
grade3:lat    -6.429e+05  1.290e+06  -0.498   0.6183    
grade4:lat    -7.898e+05  1.010e+06  -0.782   0.4340    
grade5:lat    -6.115e+05  9.378e+05  -0.652   0.5143    
grade6:lat    -5.062e+05  9.315e+05  -0.543   0.5869    
grade7:lat    -3.980e+05  9.310e+05  -0.428   0.6690    
grade8:lat    -3.186e+05  9.310e+05  -0.342   0.7322    
grade9:lat     2.662e+04  9.315e+05   0.029   0.9772    
grade10:lat    3.296e+05  9.330e+05   0.353   0.7239    
grade11:lat    3.325e+05  9.395e+05   0.354   0.7234    
grade12:lat    1.801e+06  9.745e+05   1.848   0.0646 .  
grade13:lat           NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 218400 on 21588 degrees of freedom
Multiple R-squared:  0.6464,    Adjusted R-squared:  0.646 
F-statistic:  1644 on 24 and 21588 DF,  p-value: < 2.2e-16
# Candidate interaction 2 of 6: grade x view, added on top of the four main
# effects (lets the slope of price against view differ between grades).
mod5_b <- lm(
  formula = price ~ grade + lat + view + sqft_basement + grade:view,
  data = houses_tidy
)
summary(mod5_b)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement + grade:view, 
    data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1878838  -104533   -20264    69483  5165978 

Coefficients: (2 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -2.871e+07  5.606e+05 -51.208  < 2e-16 ***
grade3         1.828e+05  2.523e+05   0.725 0.468734    
grade4         8.617e+04  2.226e+05   0.387 0.698645    
grade5         1.122e+05  2.189e+05   0.513 0.608291    
grade6         1.379e+05  2.185e+05   0.631 0.528067    
grade7         2.081e+05  2.185e+05   0.952 0.340866    
grade8         3.222e+05  2.185e+05   1.475 0.140313    
grade9         5.373e+05  2.185e+05   2.459 0.013953 *  
grade10        7.518e+05  2.186e+05   3.439 0.000585 ***
grade11        1.084e+06  2.189e+05   4.953 7.35e-07 ***
grade12        1.741e+06  2.208e+05   7.887 3.23e-15 ***
grade13        2.594e+06  2.372e+05  10.939  < 2e-16 ***
lat            6.070e+05  1.086e+04  55.879  < 2e-16 ***
view           4.024e+05  3.764e+04  10.689  < 2e-16 ***
sqft_basement  1.178e+02  3.570e+00  32.987  < 2e-16 ***
grade3:view           NA         NA      NA       NA    
grade4:view   -3.485e+05  1.009e+05  -3.454 0.000553 ***
grade5:view   -3.381e+05  4.384e+04  -7.712 1.29e-14 ***
grade6:view   -3.360e+05  3.893e+04  -8.631  < 2e-16 ***
grade7:view   -3.454e+05  3.794e+04  -9.104  < 2e-16 ***
grade8:view   -3.214e+05  3.782e+04  -8.498  < 2e-16 ***
grade9:view   -3.326e+05  3.788e+04  -8.782  < 2e-16 ***
grade10:view  -2.595e+05  3.800e+04  -6.828 8.81e-12 ***
grade11:view  -2.424e+05  3.843e+04  -6.307 2.89e-10 ***
grade12:view  -2.972e+05  4.002e+04  -7.428 1.14e-13 ***
grade13:view          NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 218500 on 21589 degrees of freedom
Multiple R-squared:  0.6462,    Adjusted R-squared:  0.6459 
F-statistic:  1715 on 23 and 21589 DF,  p-value: < 2.2e-16
# Candidate interaction 3 of 6: grade x sqft_basement, added on top of the
# four main effects (lets the slope of price against basement area differ
# between grades).
mod5_c <- lm(
  formula = price ~ grade + lat + view + sqft_basement + grade:sqft_basement,
  data = houses_tidy
)
summary(mod5_c)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement + grade:sqft_basement, 
    data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-2093544  -102714   -19903    68417  4922552 

Coefficients: (2 not defined because of singularities)
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -2.922e+07  5.576e+05 -52.398  < 2e-16 ***
grade3                 1.849e+05  2.507e+05   0.737 0.460910    
grade4                 7.737e+04  2.210e+05   0.350 0.726274    
grade5                 1.098e+05  2.176e+05   0.504 0.613935    
grade6                 1.427e+05  2.172e+05   0.657 0.511185    
grade7                 2.183e+05  2.172e+05   1.005 0.314862    
grade8                 3.305e+05  2.172e+05   1.522 0.128016    
grade9                 5.195e+05  2.172e+05   2.392 0.016782 *  
grade10                7.595e+05  2.173e+05   3.495 0.000475 ***
grade11                1.067e+06  2.176e+05   4.906 9.38e-07 ***
grade12                1.603e+06  2.191e+05   7.318 2.61e-13 ***
grade13                2.047e+06  2.372e+05   8.629  < 2e-16 ***
lat                    6.177e+05  1.080e+04  57.166  < 2e-16 ***
view                   8.401e+04  2.100e+03  40.011  < 2e-16 ***
sqft_basement          9.153e+02  5.204e+01  17.589  < 2e-16 ***
grade3:sqft_basement          NA         NA      NA       NA    
grade4:sqft_basement  -6.667e+01  1.106e+03  -0.060 0.951943    
grade5:sqft_basement  -7.957e+02  1.065e+02  -7.470 8.33e-14 ***
grade6:sqft_basement  -8.503e+02  5.511e+01 -15.429  < 2e-16 ***
grade7:sqft_basement  -8.442e+02  5.238e+01 -16.118  < 2e-16 ***
grade8:sqft_basement  -8.273e+02  5.241e+01 -15.785  < 2e-16 ***
grade9:sqft_basement  -7.618e+02  5.272e+01 -14.451  < 2e-16 ***
grade10:sqft_basement -7.237e+02  5.317e+01 -13.611  < 2e-16 ***
grade11:sqft_basement -6.234e+02  5.421e+01 -11.499  < 2e-16 ***
grade12:sqft_basement -5.887e+02  5.646e+01 -10.427  < 2e-16 ***
grade13:sqft_basement         NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 217100 on 21589 degrees of freedom
Multiple R-squared:  0.6505,    Adjusted R-squared:  0.6502 
F-statistic:  1747 on 23 and 21589 DF,  p-value: < 2.2e-16
# Candidate interaction 4 of 6: lat x view, added on top of the four main
# effects (both predictors are numeric, so this adds a single product term).
mod5_d <- lm(
  formula = price ~ grade + lat + view + sqft_basement + lat:view,
  data = houses_tidy
)
summary(mod5_d)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement + lat:view, 
    data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1544431  -103361   -20451    68950  5201833 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -2.680e+07  5.780e+05 -46.362  < 2e-16 ***
grade3         1.749e+05  2.530e+05   0.691 0.489328    
grade4         7.915e+04  2.228e+05   0.355 0.722429    
grade5         1.100e+05  2.195e+05   0.501 0.616167    
grade6         1.372e+05  2.191e+05   0.626 0.531099    
grade7         2.062e+05  2.191e+05   0.941 0.346529    
grade8         3.209e+05  2.191e+05   1.464 0.143085    
grade9         5.296e+05  2.191e+05   2.417 0.015660 *  
grade10        7.858e+05  2.192e+05   3.585 0.000338 ***
grade11        1.152e+06  2.194e+05   5.251 1.53e-07 ***
grade12        1.757e+06  2.203e+05   7.974 1.61e-15 ***
grade13        3.133e+06  2.274e+05  13.776  < 2e-16 ***
lat            5.668e+05  1.125e+04  50.366  < 2e-16 ***
view          -1.069e+07  7.448e+05 -14.352  < 2e-16 ***
sqft_basement  1.188e+02  3.567e+00  33.320  < 2e-16 ***
lat:view       2.266e+05  1.566e+04  14.471  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 219100 on 21597 degrees of freedom
Multiple R-squared:  0.6442,    Adjusted R-squared:  0.6439 
F-statistic:  2607 on 15 and 21597 DF,  p-value: < 2.2e-16
# Candidate interaction 5 of 6: lat x sqft_basement, added on top of the four
# main effects (both predictors are numeric, so this adds a single product
# term).
mod5_e <- lm(
  formula = price ~ grade + lat + view + sqft_basement + lat:sqft_basement,
  data = houses_tidy
)
summary(mod5_e)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement + lat:sqft_basement, 
    data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1550147  -101737   -18763    68303  5174888 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -2.488e+07  6.359e+05 -39.124  < 2e-16 ***
grade3             1.670e+05  2.532e+05   0.659 0.509600    
grade4             7.862e+04  2.230e+05   0.353 0.724442    
grade5             1.072e+05  2.197e+05   0.488 0.625492    
grade6             1.362e+05  2.193e+05   0.621 0.534705    
grade7             2.060e+05  2.193e+05   0.939 0.347638    
grade8             3.210e+05  2.193e+05   1.464 0.143228    
grade9             5.299e+05  2.193e+05   2.416 0.015700 *  
grade10            7.885e+05  2.194e+05   3.594 0.000326 ***
grade11            1.158e+06  2.196e+05   5.274 1.35e-07 ***
grade12            1.765e+06  2.205e+05   8.004 1.26e-15 ***
grade13            3.143e+06  2.276e+05  13.809  < 2e-16 ***
lat                5.264e+05  1.256e+04  41.918  < 2e-16 ***
view               8.911e+04  2.093e+03  42.569  < 2e-16 ***
sqft_basement     -1.698e+04  1.310e+03 -12.968  < 2e-16 ***
lat:sqft_basement  3.595e+02  2.752e+01  13.060  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 219300 on 21597 degrees of freedom
Multiple R-squared:  0.6435,    Adjusted R-squared:  0.6433 
F-statistic:  2599 on 15 and 21597 DF,  p-value: < 2.2e-16
# Candidate interaction 6 of 6: view x sqft_basement, added on top of the four
# main effects (both predictors are numeric, so this adds a single product
# term).
mod5_f <- lm(
  formula = price ~ grade + lat + view + sqft_basement + view:sqft_basement,
  data = houses_tidy
)
summary(mod5_f)

Call:
lm(formula = price ~ grade + lat + view + sqft_basement + view:sqft_basement, 
    data = houses_tidy)

Residuals:
     Min       1Q   Median       3Q      Max 
-1638784  -104221   -19570    69709  5166209 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -2.883e+07  5.636e+05 -51.155  < 2e-16 ***
grade3              1.833e+05  2.537e+05   0.723 0.469931    
grade4              8.399e+04  2.234e+05   0.376 0.707022    
grade5              1.118e+05  2.201e+05   0.508 0.611432    
grade6              1.385e+05  2.197e+05   0.630 0.528579    
grade7              2.087e+05  2.197e+05   0.950 0.342139    
grade8              3.238e+05  2.197e+05   1.474 0.140599    
grade9              5.312e+05  2.197e+05   2.417 0.015645 *  
grade10             7.884e+05  2.198e+05   3.587 0.000335 ***
grade11             1.155e+06  2.200e+05   5.250 1.54e-07 ***
grade12             1.742e+06  2.210e+05   7.886 3.27e-15 ***
grade13             3.138e+06  2.281e+05  13.756  < 2e-16 ***
lat                 6.096e+05  1.092e+04  55.822  < 2e-16 ***
view                7.106e+04  2.847e+03  24.962  < 2e-16 ***
sqft_basement       1.056e+02  3.910e+00  27.003  < 2e-16 ***
view:sqft_basement  2.870e+01  3.073e+00   9.339  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 219700 on 21597 degrees of freedom
Multiple R-squared:  0.6422,    Adjusted R-squared:  0.6419 
F-statistic:  2584 on 15 and 21597 DF,  p-value: < 2.2e-16
# mod5_c (grade:sqft_basement) has the highest adjusted R-squared of the six
# interaction models, so inspect its diagnostic plots on a 2 x 2 grid.
# par() returns the settings it replaces; restore them afterwards so that any
# later base-graphics plot is not drawn into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(mod5_c)
par(old_par)
not plotting observations with leverage one:
  8620, 19453

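It seems that the grade:sqft_basement interaction leads to the highest r2 (adjusted R-squared of 0.6502, versus 0.6419 to 0.646 for the other five candidates). Note that two levels of the interaction (grade3 and grade13) could not be estimated because of singularities in the fit, so their coefficients are reported as NA.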

Now let’s see a visualisation of the effect of this interaction.

# Residual against basement area, one panel per grade, with a straight-line
# fit in each panel; slopes that differ between panels are the visual
# signature of a grade:sqft_basement interaction.
# NOTE(review): assumes houses_resid holds the residuals of the main-effects
# model alongside the original predictors - confirm where it is built.
ggplot(
  data = houses_resid,
  mapping = aes(x = sqft_basement, y = resid, colour = grade)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(vars(grade))

Relative importance of predictors:

# relaimpo provides calc.relimp() for relative-importance metrics.
# NOTE: attaching it also attaches MASS, which masks dplyr::select(); call
# dplyr::select() explicitly in any code run after this point.
library(relaimpo)
Loading required package: MASS
package ‘MASS’ was built under R version 3.6.2
Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select

Loading required package: boot
Loading required package: survey
package ‘survey’ was built under R version 3.6.2Loading required package: grid
Loading required package: Matrix

Attaching package: ‘Matrix’

The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack

Loading required package: survival

Attaching package: ‘survival’

The following object is masked from ‘package:boot’:

    aml


Attaching package: ‘survey’

The following object is masked from ‘package:graphics’:

    dotchart

Loading required package: mitools
This is the global version of package relaimpo.

If you are a non-US user, a version with the interesting additional metric pmvd is available

from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
# Relative importance of the four main-effect predictors in mod4_a (the model
# without an interaction), using the LMG metric.
# calc.relimp() chooses its metrics through the `type` argument; rela = TRUE
# normalises the importance shares so that they sum to 100%.
calc.relimp(mod4_a, type = "lmg", rela = TRUE)
Response variable: price 
Total response variance: 134782378397 
Analysis based on 21613 observations 

14 Regressors: 
Some regressors combined in groups: 
        Group  grade : grade3 grade4 grade5 grade6 grade7 grade8 grade9 grade10 grade11 grade12 grade13 

 Relative importance of 4 (groups of) regressors assessed: 
 grade lat view sqft_basement 
 
Proportion of variance explained by model: 64.07%
Metrics are normalized to sum to 100% (rela=TRUE). 

Relative importance metrics: 

                     lmg
grade         0.67013680
lat           0.10999806
view          0.13594911
sqft_basement 0.08391603

Average coefficients for different model sizes: 

                    1group      2groups      3groups      4groups
grade3          63666.6672  105085.5034  144251.4794  182970.8245
grade4          72381.0351   74840.3850   77838.0192   81340.5229
grade5         106523.9716  106120.0182  106860.8796  108762.8919
grade6         159919.6380  148997.5902  140821.1511  135460.9340
grade7         260590.2629  235879.0272  216962.5200  204157.6486
grade8         400852.7662  366263.3497  339139.6632  319376.1318
grade9         631513.1864  588676.4480  554462.1758  528149.5236
grade10        929771.0746  870400.5336  822843.4234  785879.9257
grade11       1354841.7274 1272639.8932 1206980.5470 1155890.1732
grade12       2049222.0006 1930531.4478 1836851.8205 1765372.6480
grade13       3567615.3852 3398156.8184 3266023.2283 3169155.3827
lat            813411.5832  722508.3996  660525.2637  607867.6492
view           190335.2479  151137.4701  117930.1234   89031.6774
sqft_basement     268.6136     203.8056     154.5114     120.3599

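It looks like the grade of the property is the most important determinant of price (about 67% of the explained variance), followed by the number of views the property has received (about 14%), then latitude (about 11%) and basement area (about 8%).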

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCgpgYGB7cn0KCmhvdXNlX2RhdGEgPC0gcmVhZF9jc3YoImRhdGEva2NfaG91c2VfZGF0YS5jc3YiKQoKYGBgCgpDT0RFQ0xBTiBTT0lMVVRJT04KCmBgYHtyfQpob3VzZXMgPC0gcmVhZF9jc3YoImRhdGEva2NfaG91c2VfZGF0YS5jc3YiKQpgYGAKCmBgYHtyfQpnbGltcHNlKGhvdXNlcykKYGBgCgoKYGBge3J9CmxpYnJhcnkobW9kZWxyKQpgYGAKCgoKYGBge3J9CmhvdXNlX2RhdGEKYGBgCgoKVGlkeSB1cCB0aGUgZGF0YSByZWFkeSBmb3IgcmVncmVzc2lvbjoKCllvdSBtaWdodCBsaWtlIHRvIHRoaW5rIGFib3V0IHJlbW92aW5nIHNvbWUgb3IgYWxsIG9mIGRhdGUsIGlkLCBzcWZ0X2xpdmluZzE1LCBzcWZ0X2xvdDE1IGFuZCB6aXBjb2RlIChsYXQgYW5kIGxvbmcgcHJvdmlkZSBhIGJldHRlciBtZWFzdXJlIG9mIGxvY2F0aW9uIGluIGFueSBldmVudCkuCgpgYGB7cn0KaG91c2VfZGF0YV90aWR5IDwtIGhvdXNlX2RhdGEgJT4lCiAgc2VsZWN0KC1jKCJkYXRlIiwgImlkIiwgInNxZnRfbGl2aW5nMTUiLCAic3FmdF9sb3QxNSIsICJ6aXBjb2RlIikpCgpob3VzZV9kYXRhX3RpZHkKYGBgCgpDT0RFQ0xBTiBTT0xVVElPTgoKYGBge3J9CiMgdGlkeSB1cCBkYXRhLiBJbiBwYXJ0aWN1bGFyIHRyZWF0IGNvbmRpdGlvbiBhbmQgZ3JhZGUgYXMgZmFjdG9yLCBhcyB0aGV5IGFyZQojIG9yZGluYWwgY2F0ZWdvcmljYWwKaG91c2VzX3RpZHkgPC0gaG91c2VzICU+JQogIHNlbGVjdCgtYygiaWQiLCAiZGF0ZSIsICJzcWZ0X2xpdmluZzE1IiwgInNxZnRfbG90MTUiLCAiemlwY29kZSIpKSAlPiUKICBtdXRhdGUod2F0ZXJmcm9udCA9IGFzLmxvZ2ljYWwod2F0ZXJmcm9udCkpICU+JQogIG11dGF0ZShyZW5vdmF0ZWQgPSB5cl9yZW5vdmF0ZWQgIT0gMCkgJT4lCiAgc2VsZWN0KC0ieXJfcmVub3ZhdGVkIikgJT4lCiAgbXV0YXRlKGNvbmRpdGlvbiA9IGFzX2ZhY3Rvcihjb25kaXRpb24pKSAlPiUKICBtdXRhdGUoZ3JhZGUgPSBhc19mYWN0b3IoZ3JhZGUpKQoKZ2xpbXBzZShob3VzZXNfdGlkeSkKYGBgCgoKSGF2ZSBhIHRoaW5rIGFib3V0IGhvdyB0byB0cmVhdCB3YXRlcmZyb250LiBTaG91bGQgd2UgY29udmVydCBpdHMgdHlwZT8KCldlIGNvbnZlcnRlZCB5cl9yZW5vdmF0ZWQgaW50byBhIHJlbm92YXRlZCBsb2dpY2FsIHZhcmlhYmxlLCBpbmRpY2F0aW5nIHdoZXRoZXIgdGhlIHByb3BlcnR5IGhhZCBldmVyIGJlZW4gcmVub3ZhdGVkLiBZb3UgbWF5IHdpc2ggdG8gZG8gdGhlIHNhbWUuCgpIYXZlIGEgdGhpbmsgYWJvdXQgaG93IHRvIHRyZWF0IGNvbmRpdGlvbiBhbmQgZ3JhZGU/IEFyZSB0aGV5IGludGVydmFsIG9yIGNhdGVnb3JpY2FsIG9yZGluYWwgZGF0YSB0eXBlcz8KCgpDaGVjayBmb3IgYWxpYXNlZCB2YXJpYWJsZXMgdXNpbmcgdGhlIGFsaWFzKCkgZnVuY3Rpb24gKHRoaXMgdGFr
ZXMgaW4gYSBmb3JtdWxhIG9iamVjdCBhbmQgYSBkYXRhIHNldCkuIFtIaW50IC0gZm9ybXVsYSBwcmljZSB+IC4gc2F5cyDigJhwcmljZSB2YXJ5aW5nIHdpdGggYWxsIHByZWRpY3RvcnPigJksIHRoaXMgaXMgYSBzdWl0YWJsZSBpbnB1dCB0byBhbGlhcygpXS4gUmVtb3ZlIHZhcmlhYmxlcyB0aGF0IGxlYWQgdG8gYW4gYWxpYXMuIENoZWNrIHRoZSDigJhFbGVtZW50cyBvZiBtdWx0aXBsZSByZWdyZXNzaW9u4oCZIGxlc3NvbiBmb3IgYSBkcm9wZG93biBjb250YWluaW5nIGZ1cnRoZXIgaW5mb3JtYXRpb24gb24gZmluZGluZyBhbGlhc2VkIHZhcmlhYmxlcyBpbiBhIGRhdGFzZXQuCgpgYGB7cn0KbGlicmFyeShtb2RlbHIpCmBgYAoKCmBgYHtyfQphbGlhcyhsbShwcmljZX4gLiwgZGF0YSA9IGhvdXNlX2RhdGFfdGlkeSkpCmBgYAoKYGBge3J9CmhvdXNlX3ByaWNlX3RpZHkgPC0gaG91c2VfZGF0YV90aWR5ICU+JQogIHNlbGVjdCgtYygic3FmdF9saXZpbmciLCAic3FmdF9hYm92ZSIpKQpgYGAKCkNPREVDTEFOIFNPTFVUSU9OCgpgYGB7cn0KIyBBbGlhcyBpcyB1c2VmdWwgdG8gY2hlY2sgaWYgd2UgaGF2ZSBhbGlhc2VkIHZhcmlhYmxlcywgaS5lLiBvbmUgb3IgbW9yZQojIHZhcmlhYmxlcyB0aGF0IGNhbiBiZSBjb21wdXRlZCBmcm9tIG90aGVyIHZhcmlhYmxlcwphbGlhcyhwcmljZSB+IC4sIGRhdGEgPSBob3VzZXNfdGlkeSkKCmBgYAoKYGBge3J9CiMgc2VlbXMgdGhhdCBzcWZ0X2Jhc2VtZW50IGNhbiBiZSBjb21wdXRlZCBmcm9tIHNxZnRfbGl2aW5nIC0gc3FmdF9hYm92ZS4KIyBsZXQncyBkcm9wIHNxZnRfbGl2aW5nIGxlYXZpbmcganVzdCB0aGUgdHdvIGNvbnRyaWJ1dGlvbnMgc3FmdF9iYXNlbWVudCBhbmQgCiMgc3FmdF9hYm92ZQpob3VzZXNfdGlkeSA8LSBob3VzZXNfdGlkeSAlPiUKICBzZWxlY3QoLSJzcWZ0X2xpdmluZyIpCgpnbGltcHNlKGhvdXNlc190aWR5KQpgYGAKCgpTeXN0ZW1hdGljYWxseSBidWlsZCBhIHJlZ3Jlc3Npb24gbW9kZWwgY29udGFpbmluZyB1cCB0byBmb3VyIG1haW4gZWZmZWN0cyAocmVtZW1iZXIsIGEgbWFpbiBlZmZlY3QgaXMganVzdCBhIHNpbmdsZSBwcmVkaWN0b3Igd2l0aCBjb2VmZmljaWVudCksIHRlc3RpbmcgdGhlIHJlZ3Jlc3Npb24gZGlhZ25vc3RpY3MgYXMgeW91IGdvCgpzcGxpdHRpbmcgZGF0YXNldHMgaW50byBudW1lcmljIGFuZCBub24tbnVtZXJpYyBjb2x1bW5zIG1pZ2h0IGhlbHAgZ2dwYWlycygpIHJ1biBpbiBtYW5hZ2VhYmxlIHRpbWUsIGFsdGhvdWdoIHlvdSB3aWxsIG5lZWQgdG8gYWRkIGVpdGhlciBhIHByaWNlIG9yIHJlc2lkIGNvbHVtbiB0byB0aGUgbm9uLW51bWVyaWMgZGF0YWZyYW1lIGluIG9yZGVyIHRvIHNlZSBpdHMgY29ycmVsYXRpb25zIHdpdGggdGhlIG5vbi1udW1lcmljIHByZWRpY3RvcnMuCgpgYGB7cn0KbGlicmFyeShHR2FsbHkpCmBgYAoKCmBgYHtyfQpob3VzZXNfdGlkeV9udW1lcmljIDwtIGhvdXNlX3ByaWNlX3RpZHkgJT4lCiAg
c2VsZWN0X2lmKGlzLm51bWVyaWMpCgpob3VzZXNfdGlkeV9ub25udW1lcmljIDwtIGhvdXNlX3ByaWNlX3RpZHkgJT4lCiAgc2VsZWN0X2lmKGZ1bmN0aW9uKHgpICFpcy5udW1lcmljKHgpKQoKaG91c2VzX3RpZHlfbm9ubnVtZXJpYyRwcmljZSA8LSBob3VzZV9wcmljZV90aWR5JHByaWNlCgpnZ3BhaXJzKGhvdXNlc190aWR5X251bWVyaWMpCmdncGFpcnMoaG91c2VzX3RpZHlfbm9ubnVtZXJpYykKYGBgCgoKYGBge3J9Cm1vZDFhIDwtIGxtKHByaWNlIH4gZ3JhZGUsIGRhdGEgPSBob3VzZXNfdGlkeV9udW1lcmljKQoKbW9kMWEKYGBgCgoKYGBge3J9CnN1bW1hcnkobW9kMWEpCmBgYAoKYGBge3J9CnBhcihtZnJvdyA9IGMoMiwyKSkKcGxvdChtb2QxYSkKYGBgCgoKYGBge3J9Cm1vZDJhIDwtIGxtKHByaWNlIH4gY29uZGl0aW9uLCBkYXRhID0gaG91c2VfZGF0YV90aWR5KQoKbW9kMmEKYGBgCgoKYGBge3J9CnN1bW1hcnkobW9kMmEpCmBgYAoKYGBge3J9CmhvdXNlX3JlbWlhbmluZ19yZXNpZCA8LSBob3VzZXNfdGlkeV9udW1lcmljICU+JQogIGFkZF9yZXNpZHVhbHMobW9kMWEpICU+JQogIHNlbGVjdCgtYygicHJpY2UiLCAiZ3JhZGUiKSkKCmhvdXNlX3JlbWlhbmluZ19yZXNpZCAlPiUKICBnZ3BhaXJzKGFlcyhjb2xvdXIgPSBjb25kaXRpb24sIGFscGhhID0gMC41KSkKYGBgCgoKYGBge3J9CgpwYXIobWZyb3cgPSBjKDIsMikpCnBsb3QobW9kMmEpCgpgYGAKCmBgYHtyfQphbm92YShtb2QxYSwgbW9kMmEpCmBgYAoKYGBge3J9Cgptb2QxYiA8LSBsbShwcmljZSB+IGxvbmcsIGRhdGEgPSBob3VzZXNfdGlkeV9udW1lcmljKQoKbW9kMWIKCmBgYAoKCmBgYHtyfQpzdW1tYXJ5KG1vZDFiKQpgYGAKCkNPREVDTEFOIFNPTFVUSU9OCgpgYGB7cn0KaG91c2VzX3RpZHlfbnVtZXJpYyA8LSBob3VzZXNfdGlkeSAlPiUKICBzZWxlY3RfaWYoaXMubnVtZXJpYykKCmhvdXNlc190aWR5X25vbm51bWVyaWMgPC0gaG91c2VzX3RpZHkgJT4lCiAgc2VsZWN0X2lmKGZ1bmN0aW9uKHgpICFpcy5udW1lcmljKHgpKQoKaG91c2VzX3RpZHlfbm9ubnVtZXJpYyRwcmljZSA8LSBob3VzZXNfdGlkeSRwcmljZQoKZ2dwYWlycyhob3VzZXNfdGlkeV9udW1lcmljKQoKYGBgCgpgYGB7cn0KZ2dwYWlycyhob3VzZXNfdGlkeV9ub25udW1lcmljKQpgYGAKCkNvcnJlbGF0aW9uIG9mIHNxZnRfYWJvdmUgd2l0aCBwcmljZSBsb29rcyBwcmV0dHkgcHJvbWlzaW5nLCBidXQgc3BsaXQgb2YgcHJpY2UgYnkgZ3JhZGUgYW5kIHdhdGVyZnJvbnQgYWxzbyBsb29rIGRlY2VudC4KCmBgYHtyfQpob3VzZXNfdGlkeSAlPiUKICBnZ3Bsb3QoYWVzKHggPSBncmFkZSwgeSA9IHByaWNlKSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKYGBge3J9CmhvdXNlc190aWR5ICU+JQogIGdncGxvdChhZXMoeCA9IHdhdGVyZnJvbnQsIHkgPSBwcmljZSkpICsKICBnZW9tX2JveHBsb3QoKQpgYGAKCmBgYHtyfQptb2QxX2EgPC0gbG0ocHJpY2UgfiBzcWZ0X2Fib3ZlLCBkYXRh
ID0gaG91c2VzX3RpZHkpCnN1bW1hcnkobW9kMV9hKQpgYGAKCmBgYHtyfQptb2QxX2IgPC0gbG0ocHJpY2UgfiBncmFkZSwgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5KG1vZDFfYikKYGBgCgpgYGB7cn0KbW9kMV9jIDwtIGxtKHByaWNlIH4gd2F0ZXJmcm9udCwgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5KG1vZDFfYykKYGBgCgpgYGB7cn0KIyBncmFkZSBsb29rcyB0aGUgbW9zdCBwcm9taXNpbmcsIGJ1dCBzb21lIG9mIHRoZSBncmFkZSBsZXZlbCBjb2VmZnMgYXJlIGluc2lnbmlmaWNhbnQuCiMgdGhlIEYtdGVzdCBhdCB0aGUgYm90dG9tIG9mIHRoZSByZWdyZXNzaW9uIG91dHB1dCB0ZXN0cyBhZ2FpbnN0IHRoZSBudWxsIG1vZGVsIChpLmUuIGludGVyY2VwdCBvbmx5KQojIGJ1dCwgaWYgd2Ugd2FudCwgd2UgY2FuIHJlcGxpY2F0ZSB0aGlzIHVzaW5nIGEgc2VwYXJhdGUgYW5vdmEKIyBudWxsIG1vZGVsOiByZWdyZXNzIHByaWNlIG9uIGludGVyY2VwdCBvbmx5Cm51bGxfbW9kZWwgPC0gbG0ocHJpY2UgfiAxLCBkYXRhID0gaG91c2VzX3RpZHkpCmdyYWRlX21vZGVsIDwtIGxtKHByaWNlIH4gZ3JhZGUsIGRhdGEgPSBob3VzZXNfdGlkeSkKYW5vdmEobnVsbF9tb2RlbCwgZ3JhZGVfbW9kZWwpCmBgYAoKYGBge3J9CiMgZ3JhZGUgaXMgc2lnbmlmaWNhbnQsIGxldCdzIGtlZXAgaXQuIE5vdyBwbG90IGRpYWdub3N0aWNzCnBhcihtZnJvdyA9IGMoMiwgMikpCnBsb3QobW9kMV9iKQpgYGAKCmBgYHtyfQpob3VzZXNfcmVzaWQgPC0gaG91c2VzX3RpZHkgJT4lCiAgYWRkX3Jlc2lkdWFscyhtb2QxYikgJT4lCiAgc2VsZWN0KC1jKCJwcmljZSIsICJncmFkZSIpKQoKaG91c2VzX3Jlc2lkX251bWVyaWMgPC0gaG91c2VzX3Jlc2lkICU+JQogIHNlbGVjdF9pZihpcy5udW1lcmljKQoKaG91c2VzX3Jlc2lkX25vbm51bWVyaWMgPC0gaG91c2VzX3Jlc2lkICU+JQogIHNlbGVjdF9pZihmdW5jdGlvbih4KSAhaXMubnVtZXJpYyh4KSkKCmhvdXNlc19yZXNpZF9ub25udW1lcmljJHJlc2lkIDwtIGhvdXNlc19yZXNpZCRyZXNpZApgYGAKCmBgYHtyfQpnZ3BhaXJzKGhvdXNlc19yZXNpZF9udW1lcmljKQpgYGAKCgpgYGB7cn0KZ2dwYWlycyhob3VzZXNfcmVzaWRfbm9ubnVtZXJpYykKYGBgCgpsYXQgaGFzIGhpZ2hlc3QgY29ycmVsYXRpb24gd2l0aCByZXNpZHVhbHMsIGJ1dCwgYWdhaW4sIHdhdGVyZnJvbnQgc3RpbGwgbG9va3MgcHJldHR5IHByb21pc2luZy4gVHJ5IGJvdGjigKYKCmBgYHtyfQptb2QyX2EgPC0gbG0ocHJpY2UgfiBncmFkZSArIGxhdCwgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5KG1vZDJfYSkKYGBgCgpgYGB7cn0KbW9kMl9iIDwtIGxtKHByaWNlIH4gZ3JhZGUgKyB3YXRlcmZyb250LCBkYXRhID0gaG91c2VzX3RpZHkpCnN1bW1hcnkobW9kMl9iKQpgYGAKCmBgYHtyfQojIGxhdCBpcyBzaWduaWZpY2FudCBhbmQgaGlnaGVyIHJeMiwgbGV0J3Mga2VlcCBtb2RlbDJhCnBhcihtZnJvdyA9IGMoMiwg
MikpCnBsb3QobW9kMl9hKQpgYGAKCmBgYHtyfQpob3VzZXNfcmVzaWQgPC0gaG91c2VzX3RpZHkgJT4lCiAgYWRkX3Jlc2lkdWFscyhtb2QyX2EpICU+JQogIHNlbGVjdCgtYygicHJpY2UiLCAiZ3JhZGUiLCAibGF0IikpCgpob3VzZXNfcmVzaWRfbnVtZXJpYyA8LSBob3VzZXNfcmVzaWQgJT4lCiAgc2VsZWN0X2lmKGlzLm51bWVyaWMpCgpob3VzZXNfcmVzaWRfbm9ubnVtZXJpYyA8LSBob3VzZXNfcmVzaWQgJT4lCiAgc2VsZWN0X2lmKGZ1bmN0aW9uKHgpICFpcy5udW1lcmljKHgpKQoKaG91c2VzX3Jlc2lkX25vbm51bWVyaWMkcmVzaWQgPC0gaG91c2VzX3Jlc2lkJHJlc2lkCmBgYAoKYGBge3J9CmdncGFpcnMoaG91c2VzX3Jlc2lkX251bWVyaWMpCmBgYAoKYGBge3J9CmdncGFpcnMoaG91c2VzX3Jlc2lkX25vbm51bWVyaWMpCmBgYAoKTm93IHZpZXcgaGFzIHN0cm9uZ2VzdCBjb3JyZWxhdGlvbiB3aXRoIHJlc2lkdWFscywgYnV0IGFsc28gY29tcGFyZSBhZ2FpbnN0IG1vZGVsIHdpdGggd2F0ZXJmcm9udC4KCmBgYHtyfQptb2QzX2EgPC0gbG0ocHJpY2UgfiBncmFkZSArIGxhdCArIHZpZXcsIGRhdGEgPSBob3VzZXNfdGlkeSkKc3VtbWFyeShtb2QzX2EpCmBgYAoKYGBge3J9Cm1vZDNfYiA8LSBsbShwcmljZSB+IGdyYWRlICsgbGF0ICsgd2F0ZXJmcm9udCwgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5KG1vZDNfYikKYGBgCgpgYGB7cn0KIyB2aWV3IG1vZGVsIGlzIGJlc3QsIGtlZXAgbW9kM2EKcGFyKG1mcm93ID0gYygyLCAyKSkKcGxvdChtb2QzX2EpCmBgYAoKYGBge3J9CmhvdXNlc19yZXNpZCA8LSBob3VzZXNfdGlkeSAlPiUKICBhZGRfcmVzaWR1YWxzKG1vZDNfYSkgJT4lCiAgc2VsZWN0KC1jKCJwcmljZSIsICJncmFkZSIsICJsYXQiLCAidmlldyIpKQoKaG91c2VzX3Jlc2lkX251bWVyaWMgPC0gaG91c2VzX3Jlc2lkICU+JQogIHNlbGVjdF9pZihpcy5udW1lcmljKQoKaG91c2VzX3Jlc2lkX25vbm51bWVyaWMgPC0gaG91c2VzX3Jlc2lkICU+JQogIHNlbGVjdF9pZihmdW5jdGlvbih4KSAhaXMubnVtZXJpYyh4KSkKCmhvdXNlc19yZXNpZF9ub25udW1lcmljJHJlc2lkIDwtIGhvdXNlc19yZXNpZCRyZXNpZApgYGAKCmBgYHtyfQpnZ3BhaXJzKGhvdXNlc19yZXNpZF9udW1lcmljKQpgYGAKCgpgYGB7cn0KZ2dwYWlycyhob3VzZXNfcmVzaWRfbm9ubnVtZXJpYykKYGBgCgpzcWZ0X2Jhc2VtZW50IGhhcyBoaWdoZXN0IGNvcnJlbGF0aW9uIHdpdGggcmVzaWR1YWxzLiBMZXTigJlzIHRlc3QgYWdhaW5zdCBhbGwgcmVtYWluaW5nIGNhdGVnb3JpY2FsIHByZWRpY3RvcnM6CgpgYGB7cn0KbW9kNF9hIDwtIGxtKHByaWNlIH4gZ3JhZGUgKyBsYXQgKyB2aWV3ICsgc3FmdF9iYXNlbWVudCwgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5KG1vZDRfYSkKYGBgCgpgYGB7cn0KbW9kNF9iIDwtIGxtKHByaWNlIH4gZ3JhZGUgKyBsYXQgKyB2aWV3ICsgd2F0ZXJmcm9udCwgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1t
YXJ5KG1vZDRfYikKYGBgCgpgYGB7cn0KbW9kNF9jIDwtIGxtKHByaWNlIH4gZ3JhZGUgKyBsYXQgKyB2aWV3ICsgY29uZGl0aW9uLCBkYXRhID0gaG91c2VzX3RpZHkpCnN1bW1hcnkobW9kNF9jKQpgYGAKCmBgYHtyfQptb2Q0X2QgPC0gbG0ocHJpY2UgfiBncmFkZSArIGxhdCArIHZpZXcgKyByZW5vdmF0ZWQsIGRhdGEgPSBob3VzZXNfdGlkeSkKc3VtbWFyeShtb2Q0X2QpCmBgYAoKYGBge3J9CiMgbG9va3MgbGlrZSBtb2RlbCB3aXRoIHNxZnRfYmFzZW1lbnQgaXMgYmVzdCwga2VlcCBtb2Q0YQpwYXIobWZyb3cgPSBjKDIsIDIpKQpwbG90KG1vZDRfYSkKYGBgCgpgYGB7cn0KaG91c2VzX3Jlc2lkIDwtIGhvdXNlc190aWR5ICU+JQogIGFkZF9yZXNpZHVhbHMobW9kNF9hKSAlPiUKICBzZWxlY3QoLSBwcmljZSkKYGBgCgpPdXIgZmluYWwgbW9kZWwgaW4gdGVybXMgb2YgbWFpbiBlZmZlY3RzIGlzOiBwcmljZSB+IGdyYWRlICsgbGF0ICsgdmlldyArIHNxZnRfYmFzZW1lbnQKCkVYVEVOU0lPTgoKQ29uc2lkZXIgcG9zc2libGUgaW50ZXJhY3Rpb25zIGJldHdlZW4geW91ciBmb3VyIG1haW4gZWZmZWN0IHByZWRpY3RvcnMgYW5kIHRlc3QgdGhlaXIgZWZmZWN0IHVwb24gcjIuIENob29zZSB5b3VyIGJlc3QgY2FuZGlkYXRlIGludGVyYWN0aW9uIGFuZCB2aXN1YWxpc2UgaXRzIGVmZmVjdC4KCkNhbGN1bGF0ZSB0aGUgcmVsYXRpdmUgaW1wb3J0YW5jZSBvZiBwcmVkaWN0b3JzIGZyb20geW91ciBiZXN0IDQtcHJlZGljdG9yIG1vZGVsIChpLmUuIHRoZSBtb2RlbCB3aXRob3V0IGFuIGludGVyYWN0aW9uKS4gV2hpY2ggcHJlZGljdG9yIGFmZmVjdHMgcHJpY2UgbW9zdCBzdHJvbmdseT8KCk5vdywgZm9yIGludGVyYWN0aW9ucywgaGF2ZSBzaXggcG9zc2liaWxpdGllcyB0aGF0IG9iZXkgcHJpbmNpcGxlIG9mIHN0cm9uZyBoaWVyYXJjaHkgKGkuZS4gY29uc2lkZXIgaW5jbHVkaW5nIGFuIGludGVyYWN0aW9uIG9ubHkgaWYgaXRzIG1haW4gZWZmZWN0cyBhcmUgYWxyZWFkeSBwcmVzZW50IGluIHRoZSBtb2RlbCkKCmBgYHtyfQptb2Q1X2EgPC0gbG0ocHJpY2UgfiBncmFkZSArIGxhdCArIHZpZXcgKyBzcWZ0X2Jhc2VtZW50ICsgZ3JhZGU6bGF0LCBkYXRhID0gaG91c2VzX3RpZHkpCnN1bW1hcnkobW9kNV9hKQpgYGAKCmBgYHtyfQptb2Q1X2IgPC0gbG0ocHJpY2UgfiBncmFkZSArIGxhdCArIHZpZXcgKyBzcWZ0X2Jhc2VtZW50ICsgZ3JhZGU6dmlldywgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5KG1vZDVfYikKYGBgCgpgYGB7cn0KbW9kNV9jIDwtIGxtKHByaWNlIH4gZ3JhZGUgKyBsYXQgKyB2aWV3ICsgc3FmdF9iYXNlbWVudCArIGdyYWRlOnNxZnRfYmFzZW1lbnQsIGRhdGEgPSBob3VzZXNfdGlkeSkKc3VtbWFyeShtb2Q1X2MpCmBgYAoKYGBge3J9Cm1vZDVfZCA8LSBsbShwcmljZSB+IGdyYWRlICsgbGF0ICsgdmlldyArIHNxZnRfYmFzZW1lbnQgKyBsYXQ6dmlldywgZGF0YSA9IGhvdXNlc190aWR5KQpzdW1tYXJ5
KG1vZDVfZCkKYGBgCgpgYGB7cn0KbW9kNV9lIDwtIGxtKHByaWNlIH4gZ3JhZGUgKyBsYXQgKyB2aWV3ICsgc3FmdF9iYXNlbWVudCArIGxhdDpzcWZ0X2Jhc2VtZW50LCBkYXRhID0gaG91c2VzX3RpZHkpCnN1bW1hcnkobW9kNV9lKQpgYGAKCmBgYHtyfQptb2Q1X2YgPC0gbG0ocHJpY2UgfiBncmFkZSArIGxhdCArIHZpZXcgKyBzcWZ0X2Jhc2VtZW50ICsgdmlldzpzcWZ0X2Jhc2VtZW50LCBkYXRhID0gaG91c2VzX3RpZHkpCnN1bW1hcnkobW9kNV9mKQpgYGAKCmBgYHtyfQojIG1vZDVjIGxvb2tzIGxpa2UgdGhlIGJlc3QKcGFyKG1mcm93ID0gYygyLDIpKQpwbG90KG1vZDVfYykKYGBgCgpJdCBzZWVtcyB0aGF0IHRoZSBncmFkZTpzcWZ0X2Jhc2VtZW50IGludGVyYWN0aW9uIGxlYWRzIHRvIGhpZ2hlc3QgcjIgKGJ1dCB0d28gbGV2ZWxzIG9mIHRoZSBpbnRlcmFjdGlvbiBjYW5ub3QgYmUgZGV0ZXJtaW5lZCBkdWUgdG8gZml0dGluZyBwcm9ibGVtcykuCgpOb3cgbGV04oCZcyBzZWUgYSB2aXN1YWxpc2F0aW9uIG9mIHRoZSBlZmZlY3Qgb2YgdGhpcyBpbnRlcmFjdGlvbi4KCmBgYHtyfQpob3VzZXNfcmVzaWQgJT4lCiAgZ2dwbG90KGFlcyh4ID0gc3FmdF9iYXNlbWVudCwgeSA9IHJlc2lkLCBjb2xvdXIgPSBncmFkZSkpICsKICBnZW9tX3BvaW50KGFscGhhID0gMC41KSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSkgKwogIGZhY2V0X3dyYXAofiBncmFkZSkKYGBgCgpSZWxhdGl2ZSBpbXBvcnRhbmNlIG9mIHByZWRpY3RvcnM6CgpgYGB7cn0KbGlicmFyeShyZWxhaW1wbykKYGBgCgpgYGB7cn0KY2FsYy5yZWxpbXAobW9kNF9hLCBtZXRob2QgPSAibG1nIiwgcmVsYSA9IFRSVUUpCmBgYAoKSXQgbG9va3MgbGlrZSB0aGUgZ3JhZGUgb2YgcHJvcGVydHkgaXMgdGhlIG1vc3QgaW1wb3J0YW50IGRldGVybWluZXIgb2YgcHJpY2UsIGZvbGxvd2VkIGJ5IHRoZSBudW1iZXIgb2Ygdmlld3MgdGhlIHByb3BlcnR5IGhhcyByZWNlaXZlZC4KCg==